#! pip install --upgrade ppscore
#! pip install --upgrade plotly
#! pip install --upgrade nbformat
#! pip install --upgrade pdpbox
#! pip install --upgrade shap
import gc, pickle
from copy import deepcopy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import ppscore as pps
import shap, optuna
from pdpbox import pdp
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.inspection import permutation_importance
def rmse(true, pred):
    """Root mean squared error: sqrt of sklearn's mean_squared_error."""
    return mse(true, pred) ** 0.5
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold
from sklearn.decomposition import PCA
SEED = 2
# Load the cleaned Berlin housing data and drop columns not used for modelling.
# FIX: the positional `axis` argument to DataFrame.drop was deprecated and then
# removed in pandas 2.0 — use the `columns=` keyword instead.
df = pd.read_pickle('berlin_housing_cleaned.pkl').drop(columns=['url', 'security_deposit'])
TARGET = 'rent'
VALID_SZ = 0.1
# object-dtype columns are treated as categorical; everything else (except the
# target) as continuous
CAT_COLS = df.columns[df.dtypes == object].tolist()
CONT_COLS = [c for c in df.columns if c not in CAT_COLS and c != TARGET]
# reorder columns: categorical, continuous, then target
df = deepcopy(df[CAT_COLS + CONT_COLS + [TARGET]])
df.shape
df.isnull().sum().sum()
df.head()
# Predictive Power Score between every pair of columns.
# NOTE(review): newer ppscore releases (>= 1.1) return a long-format DataFrame
# (columns x / y / ppscore) rather than a square matrix, in which case
# `.values` below would not be a 2-D score grid — confirm the installed
# ppscore version.
pps_matrix = pps.matrix(df)
# heatmap
fig = go.Figure(
    data=go.Heatmap(
        z=pps_matrix.values,
        x=pps_matrix.index,
        y=pps_matrix.columns
    )
)
fig.update_layout(title_text='Predictive Power Score Heatmap')
fig.show()
# Integer-encode every categorical column in place (a fresh encoder per column,
# so the label spaces stay independent).
for cat_col in CAT_COLS:
    df[cat_col] = LabelEncoder().fit_transform(df[cat_col])
def get_train_test_split(df, x_cols, target, test_size, random_state=SEED):
    """Split `df` into train/valid feature frames and target frames.

    Returns (train_x, valid_x, train_y, valid_y); targets are single-column
    DataFrames. Copies are taken so callers can mutate the results freely.
    """
    features = df[x_cols].copy(deep=True)
    labels = df[[target]].copy(deep=True)
    return train_test_split(features, labels, test_size=test_size, random_state=random_state)
# build the train/validation split
train_x, valid_x, train_y, valid_y = get_train_test_split(
    df=df,
    x_cols=CONT_COLS + CAT_COLS,
    target=TARGET,
    test_size=VALID_SZ,
)
train_x.shape, valid_x.shape, train_y.shape, valid_y.shape
# flatten the single-column target frames to 1-D arrays
train_y = train_y.values.ravel().copy()
valid_y = valid_y.values.ravel().copy()
def get_error(pred, true):
    """Print RMSE and MAE for a set of predictions against ground truth."""
    for label, metric in (('RMSE', rmse), ('MAE', mae)):
        print(f'{label}: {metric(true, pred)}')
# naive baseline (prediction is average of training data)
get_error(pred=np.full(len(valid_y), np.mean(train_y)), true=valid_y)
# naive baseline (prediction is median of training data)
get_error(pred=np.full(len(valid_y), np.median(train_y)), true=valid_y)
# untuned random forest as a first model baseline
model = RandomForestRegressor()
model.fit(train_x, train_y)
preds = model.predict(valid_x)
# (arguments arrive as (true, pred) though the signature is (pred, true);
# RMSE and MAE are symmetric, so the printed values are unaffected)
get_error(valid_y, preds)
The three most important parameters that should be tuned are:
Let's tune each parameter separately below, starting with n_estimators:
def fit_and_eval(train_x, train_y, valid_x, valid_y,
                 n_estimators=100, max_features=1.0, max_depth=None, n_jobs=-1):
    """Fit a RandomForestRegressor with the given hyperparameters and return
    its validation RMSE.

    FIX: the default max_features was 'auto', which scikit-learn removed in
    1.3; for regressors 'auto' meant "use all features", i.e. 1.0 — the exact
    equivalent used here.
    """
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_features=max_features,
        max_depth=max_depth,
        n_jobs=n_jobs,
    )
    model.fit(train_x, train_y)
    return rmse(valid_y, model.predict(valid_x))
def iter_hyperparam_values(train_x, train_y, valid_x, valid_y, test_param, test_param_values,
                           optimal_values=None, fit_iter=10):
    """Sweep `test_param` over `test_param_values`, refitting `fit_iter` times
    per value to average out random-forest noise.

    optimal_values: previously-tuned hyperparameters to hold fixed.
    Returns a dict with 'avg', 'std' and 'min' RMSE lists, one entry per value.
    """
    errors = {'avg': [], 'std': [], 'min': []}
    # start from any already-tuned hyperparameters
    hyperparams = dict(optimal_values or {})
    for test_param_value in test_param_values:
        hyperparams[test_param] = test_param_value
        trial_errors = [
            fit_and_eval(train_x, train_y, valid_x, valid_y, **hyperparams)
            for _ in range(fit_iter)
        ]
        avg_error, std_error = np.mean(trial_errors), np.std(trial_errors)
        errors['avg'].append(avg_error)
        errors['std'].append(std_error)
        errors['min'].append(min(trial_errors))
        print(f'With {test_param_value} {test_param}, Average Model Error of: {avg_error:.2f} (+/- {std_error:.2f})')
    return errors
def plot_errors(errors: dict, xaxis_title: str, norm_values: bool, x_values=None):
    """Plot the avg/std/min error curves produced by iter_hyperparam_values.

    norm_values: min-max scale each curve so differently-scaled metrics share
        one axis.
    x_values: values for the x axis. FIX: the original silently read the
        module-level global ``test_param_values``; callers can now pass the
        values explicitly, with the global kept as a backward-compatible
        fallback.
    Returns the plotly Figure.
    """
    if x_values is None:
        x_values = test_param_values  # legacy fallback to the module-level global
    xs = list(x_values)
    # renamed from `plot_errors` — the original local dict shadowed the function
    curves = {
        metric: (
            MinMaxScaler().fit_transform(np.array(errors[metric]).reshape(-1, 1)).ravel()
            if norm_values else errors[metric]
        ) for metric in ['avg', 'std', 'min']
    }
    fig = go.Figure(data=[
        go.Scatter(x=xs, y=curves['avg'], line_shape='spline', name='Average Error'),
        go.Scatter(x=xs, y=curves['std'], line_shape='spline', name='Standard Deviation'),
        go.Scatter(x=xs, y=curves['min'], line_shape='spline', name='Best Fit'),
    ])
    fig.update_layout(xaxis_title=xaxis_title, yaxis_title='Metric Value')
    return fig
# --- tune n_estimators ---
test_param = 'n_estimators'              # hyperparameter under test
test_param_values = range(10, 310, 10)   # candidate values: 10..300
optimal_values = None                    # nothing tuned yet
fit_iter = 10                            # fits per candidate, to average RF noise
errors = iter_hyperparam_values(train_x, train_y, valid_x, valid_y,
                                test_param, test_param_values, optimal_values, fit_iter)
plot_errors(errors, xaxis_title='Number of Estimators', norm_values=True)
#OPTIMAL_N_ESTIMATORS = ## ENTER OPTIMAL VALUE HERE
OPTIMAL_N_ESTIMATORS = 220  # chosen from the plot above
# --- tune max_features ---
test_param = 'max_features'
# FIX: 'auto' was removed in scikit-learn 1.3; for a regressor it meant
# "use all features", which is now spelled 1.0.
test_param_values = [1.0, 'sqrt', 'log2']
optimal_values = {'n_estimators': OPTIMAL_N_ESTIMATORS}
fit_iter = 10
errors = iter_hyperparam_values(train_x, train_y, valid_x, valid_y,
                                test_param, test_param_values, optimal_values, fit_iter)
plot_errors(errors, xaxis_title='Max Features', norm_values=True)
OPTIMAL_MAX_FEATURES = 1.0  # equivalent of the old 'auto'
# --- tune max_depth ---
# (the original line had the markdown header fused onto the code by a pilcrow)
test_param = 'max_depth'
test_param_values = range(2, 32, 2)   # candidate depths: 2..30
optimal_values = {'n_estimators': OPTIMAL_N_ESTIMATORS, 'max_features': OPTIMAL_MAX_FEATURES}
fit_iter = 10
errors = iter_hyperparam_values(train_x, train_y, valid_x, valid_y,
                                test_param, test_param_values, optimal_values, fit_iter)
plot_errors(errors, xaxis_title='Max Depth', norm_values=True)
OPTIMAL_MAX_DEPTH = 28  # chosen from the plot above
# refit with the manually-tuned hyperparameters and score on the validation set
best_rf = RandomForestRegressor(
    n_estimators=OPTIMAL_N_ESTIMATORS,
    max_features=OPTIMAL_MAX_FEATURES,
    max_depth=OPTIMAL_MAX_DEPTH,
)
best_rf.fit(train_x, train_y)
preds = best_rf.predict(valid_x)
get_error(valid_y, preds)
For Random Forests, you can also tune:
- bootstrap: Whether bootstrap samples are used when building trees.
- max_leaf_nodes: Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.
- min_impurity_decrease: A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
- min_samples_leaf: The minimum number of samples required to be at a leaf node.
- min_samples_split: The minimum number of samples required to split an internal node.
- min_weight_fraction_leaf: The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node.
Is there a better way to search this space? Yup! Introducing Random Search & Grid Search...
Using a search space that you provide, sklearn will randomly sample parameter sets and evaluate their performance.
# Create the random grid
random_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_features': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    'max_depth': [None, 1, 3, 5, 10, 15, 20, 25, 50],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10],
    'bootstrap': [True],
}
# Total number of parameter combinations = product of the per-parameter counts.
# (The original built this with a manual enumerate/if-else accumulator.)
m = int(np.prod([len(v) for v in random_grid.values()]))
print(f'Number of Combinations: {m}')
# use the random grid to search for best hyperparameters
# NOTE: n_iter * cv = 25,000 model fits — this is expensive
rf_random = RandomizedSearchCV(
    estimator=RandomForestRegressor(),   # the sklearn model to use
    param_distributions=random_grid,     # the search space (specified above)
    n_iter=5000,                         # search across n combinations
    cv=5,                                # k fold cross validation
    verbose=2,                           # print out progress
    n_jobs=-1,                           # how many CPU cores to use, if -1 use all available
)
# Fit the random search model on the full dataset
rf_random.fit(df[CONT_COLS + CAT_COLS].values, df[[TARGET]].values.ravel())
# run this to get best parameters in search
#rf_random_best_params = rf_random.best_params_
# best params from prev search (hard-coded so the notebook is reproducible
# without re-running the expensive search)
rf_random_best_params = dict(
    n_estimators=100,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=4,
    max_depth=20,
    bootstrap=True,
    n_jobs=-1,
)
# fit and predict with the tuned configuration
best_rf = RandomForestRegressor(**rf_random_best_params)
best_rf.fit(train_x, train_y)
preds = best_rf.predict(valid_x)
get_error(valid_y, preds)
# Create the parameter grid based on the results of random search
# (fill each list with values centred on the best random-search parameters)
param_grid = {
    'n_estimators': [],
    'min_samples_split': [],
    'min_samples_leaf': [],
    'max_features': [],
    'max_depth': [],
    'bootstrap': [],
}
# init new rf instance
# FIX: the original instantiated RandomForestClassifier, which is never
# imported in this file (NameError) and is the wrong estimator for a
# regression target anyway.
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf,
                           param_grid=param_grid,
                           cv=5,
                           n_jobs=-1,
                           verbose=2)
# FIX: the original never fitted the search, so reading best_params_ below
# would raise NotFittedError. (Requires param_grid to be filled in above.)
grid_search.fit(df[CONT_COLS + CAT_COLS].values, df[[TARGET]].values.ravel())
grid_search.best_params_
best_rf = RandomForestRegressor(**grid_search.best_params_)
best_rf.fit(train_x, train_y)
preds = best_rf.predict(valid_x)
get_error(valid_y, preds)
We have covered Feature Importance briefly already, but let's take a closer look at this super powerful technique.
def get_feature_importance(model, feature_names, categorical_cols):
    """Return a DataFrame of per-feature importances, sorted descending.

    model: a fitted estimator exposing `feature_importances_`.
    feature_names: column names aligned with the importances.
    categorical_cols: names to tag as 'categorical' (all others 'continuous'),
        so plots can colour the two feature types separately.
    """
    imp = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_,
    })
    # tag each feature's type (set lookup; the original used .apply with a lambda)
    cat_set = set(categorical_cols)
    imp['feat_type'] = ['categorical' if feat in cat_set else 'continuous'
                        for feat in imp['feature']]
    # FIX: the original sorted twice (before and after adding feat_type);
    # a single sort yields the identical result
    return imp.sort_values('importance', ascending=False)
imp = get_feature_importance(best_rf, train_x.columns.tolist(), categorical_cols=CAT_COLS)
px.bar(imp, x='feature', y='importance', color='feat_type')
# mean importance by feature type
# FIX: a bare .mean() also tries to average the string 'feature' column, which
# raises TypeError in pandas >= 2.0 — average only the numeric column.
imp.groupby('feat_type')[['importance']].mean()
def kfold_train_eval(df, features, k, model_params, return_last=False, n_iter=3):
    """Evaluate a RandomForestRegressor with k-fold cross-validation.

    Shuffles the rows, then for each of the k folds fits `n_iter` models (to
    average out RF randomness) and records validation RMSE/MAE. Returns a dict
    of mean scores; with return_last=True also returns the last fitted model.
    """
    # shuffle rows so the folds are random
    shuffled = df.sample(len(df)).reset_index(drop=True)
    x = shuffled[features]
    y = shuffled[[TARGET]]
    scores = {'rmse': [], 'mae': []}
    for train_idx, valid_idx in KFold(n_splits=k).split(x):
        # build train/valid split for this fold
        fold_train_x, fold_valid_x = x.loc[train_idx], x.loc[valid_idx]
        fold_train_y = y.loc[train_idx].values.ravel()
        fold_valid_y = y.loc[valid_idx].values.ravel()
        for _ in range(n_iter):
            # fit & predict
            model = RandomForestRegressor(**model_params)
            model.fit(fold_train_x, fold_train_y)
            preds = model.predict(fold_valid_x)
            # record error
            scores['rmse'].append(rmse(fold_valid_y, preds))
            scores['mae'].append(mae(fold_valid_y, preds))
    results = {metric: np.mean(vals) for metric, vals in scores.items()}
    return (results, model) if return_last else results
# kfold score before new features
kfold_train_eval(df, features=CAT_COLS + CONT_COLS, k=10,
                 model_params=rf_random_best_params)
# add additional (derived) features
# NOTE(review): these divisions assume rooms/year_construction/space are never
# zero — confirm against the cleaned data.
df['space_per_room'] = df['space'] / df['rooms']
df['years_since_renovation'] = df['renovated_date'] - df['year_construction']
df['renovation_ratio'] = df['renovated_date'] / df['year_construction']
df['energy_efficiency'] = df['energy_requirement'] / df['space']
CONT_COLS += ['space_per_room', 'years_since_renovation', 'renovation_ratio', 'energy_efficiency']
# kfold eval with new features
results, model = kfold_train_eval(
    df,
    features=CAT_COLS + CONT_COLS,
    k=10,
    model_params=rf_random_best_params,
    return_last=True,
)
results
# inspect how the new features rank
imp = get_feature_importance(model, feature_names=CAT_COLS + CONT_COLS, categorical_cols=CAT_COLS)
px.bar(imp, x='feature', y='importance', color='feat_type')
## Student TODO
So what can we do? We have a few options:
# Options for handling categoricals: use a model that natively supports the
# `category` data type (LightGBM and CatBoost are popular choices), One-Hot
# Encoding, Mean Target Encoding, or learn an embedding for each category and
# use the embeddings as features.
def target_encode_categorical_features(df, feature, target, encode_type):
    """Add a target-encoded copy of `feature` named `{feature}_{encode_type}`.

    encode_type: 'mean', 'median', or 'std' — the aggregate of `target`
        computed per category of `feature`.
    Returns a copy of `df`; raises ValueError for any other encode_type.
    """
    # validate up front instead of via an if/elif chain
    if encode_type not in ('mean', 'median', 'std'):
        raise ValueError('encode_type must be one of the following: "mean", "median", or "std"')
    df = df.copy()
    # FIX: aggregate only the target column — the original aggregated the whole
    # frame, which raises on non-numeric columns in pandas >= 2.0
    encoding_map = df.groupby(feature)[target].agg(encode_type).to_dict()
    # .map is the vectorized equivalent of the original per-row .apply lookup
    df[f'{feature}_{encode_type}'] = df[feature].map(encoding_map)
    return df
# mean-encode every categorical column against the target
for cat_col in CAT_COLS:
    df = target_encode_categorical_features(df, feature=cat_col, target=TARGET, encode_type='mean')
mean_encoded_cat_cols = [f'{cat_col}_mean' for cat_col in CAT_COLS]
# kfold eval with the mean-encoded categoricals replacing the label-encoded ones
results, model = kfold_train_eval(
    df,
    features=CONT_COLS + mean_encoded_cat_cols,
    k=10,
    model_params=rf_random_best_params,
    return_last=True,
)
results
imp = get_feature_importance(
    model=model,
    feature_names=CONT_COLS + mean_encoded_cat_cols,
    categorical_cols=mean_encoded_cat_cols,
)
px.bar(imp, x='feature', y='importance', color='feat_type')
# Median and Standard Deviation Encoding?
# (the original line had the markdown header fused onto the code by a pilcrow)
for c in CAT_COLS:
    df = target_encode_categorical_features(df, feature=c, target=TARGET, encode_type='median')
    df = target_encode_categorical_features(df, feature=c, target=TARGET, encode_type='std')
# drop the raw label-encoded categoricals; keep only the encoded versions
df.drop(CAT_COLS, axis=1, inplace=True)
# (the RHS is evaluated before the rebind, so the old CAT_COLS is still used)
CAT_COLS = [f'{c}_{encode_type}' for encode_type in ['mean', 'median', 'std'] for c in CAT_COLS]
# std is NaN for single-member categories — fill those with 0
for c in CAT_COLS:
    if '_std' in c:
        df[c] = df[c].fillna(0.)
# kfold eval with mean + median + std encodings
results, model = kfold_train_eval(
    df,
    features=CONT_COLS + CAT_COLS,
    k=10,
    model_params=rf_random_best_params,
    return_last=True,
)
results
imp = get_feature_importance(
    model=model,
    feature_names=CONT_COLS + CAT_COLS,
    categorical_cols=CAT_COLS,
)
px.bar(imp, x='feature', y='importance', color='feat_type')
def get_best_fit(train_x, train_y, valid_x, valid_y, model_params, fit_iter, best_error=None):
    """Fit `fit_iter` RandomForestRegressors and return the one with the lowest
    validation RMSE.

    best_error: optional error threshold; only models that beat it are kept.
    Raises RuntimeError if no fit beats a supplied best_error (the original
    would hit an UnboundLocalError on `best_model` in that case).
    """
    best_model = None  # FIX: initialize so the failure mode is explicit
    for _ in tqdm(range(fit_iter)):
        model = RandomForestRegressor(**model_params)
        model.fit(train_x, train_y)
        error = rmse(valid_y, model.predict(valid_x))
        if not best_error or error < best_error:
            best_error = error  # (no deepcopy needed — floats are immutable)
            best_model = deepcopy(model)
    print(f'Best error after {fit_iter} iterations: {best_error}')
    if best_model is None:
        raise RuntimeError('no fit improved on the supplied best_error')
    return best_model
def get_permutation_importance(model, valid_x, valid_y, n_repeats):
    """Return a DataFrame of mean permutation importance per column of `valid_x`."""
    result = permutation_importance(model, valid_x, valid_y, n_repeats=n_repeats)
    return pd.DataFrame({
        'feature': valid_x.columns,
        'importance': result['importances_mean'],
    })
def get_avg_feature_importance(df, features, categorical_cols,
                               model_params, n_iter, fit_iter, permutation_repeats):
    """Average RF and permutation feature importance over `n_iter` random splits.

    For each of `n_iter` fresh train/valid splits: pick the best of `fit_iter`
    fits, compute both the built-in RF importance and sklearn permutation
    importance, and merge them. The per-split results are then averaged,
    permutation importance is normalized to sum to 1, and a log-damped
    weighted score combines the two. Returns a DataFrame with columns
    [feature, feat_type, importance_rf, importance_perm, weighted_imp],
    sorted by weighted_imp descending.

    NOTE(review): relies on the module-level globals TARGET and VALID_SZ.
    """
    feature_importance_results = []
    for _ in range(n_iter):
        # rebuild train/valid with new features (random_state=None => new split each time)
        train_x, valid_x, train_y, valid_y = get_train_test_split(
            df, x_cols=features, target=TARGET, test_size=VALID_SZ, random_state=None)
        # find the "best fit" of the model, to be used to find permutation importance (you usually shouldn't do
        # this as it is considered overfitting the validation set, but for feature selection it can be helpful)
        model = get_best_fit(train_x, train_y, valid_x, valid_y, model_params, fit_iter=fit_iter)
        # get regular random forest importance
        imp = get_feature_importance(model, feature_names=train_x.columns.tolist(), categorical_cols=categorical_cols)
        # get permutation importance using sklearn, this takes a long time but the more "n_repeats" the better the results!
        permutation_imp = get_permutation_importance(model, valid_x, valid_y, n_repeats=permutation_repeats)
        # combine both importance dfs
        imp = imp.merge(permutation_imp, on='feature', suffixes=('_rf', '_perm'))
        # save this result
        feature_importance_results.append(deepcopy(imp))
    # return the average permutation importance for each feature
    imp = deepcopy(pd.concat(feature_importance_results).groupby('feature').agg({
        'feat_type': 'first',
        'importance_rf': 'mean',
        'importance_perm': 'mean',
    }).reset_index())
    # normalize permuation importance (think of it as the percentage of importance)
    imp['importance_perm'] = imp['importance_perm'] / imp['importance_perm'].sum()
    # create weighted importance (log1p damps the permutation term)
    imp['weighted_imp'] = imp.importance_rf * np.log1p(imp.importance_perm)
    imp['weighted_imp'] = imp['weighted_imp'] / imp['weighted_imp'].sum()
    # reorg columns and sort by weighted importance
    return imp[['feature', 'feat_type', 'importance_rf', 'importance_perm', 'weighted_imp']
               ].sort_values('weighted_imp', ascending=False).reset_index(drop=True).copy(deep=True)
# # get average permutation importance across {n_iter} folds
# # (kept commented out — it is very expensive; results are cached to disk below)
# imp = get_avg_feature_importance(
#     df=df,
#     features=CONT_COLS+CAT_COLS,
#     categorical_cols=CAT_COLS,
#     model_params=rf_random_best_params,
#     n_iter=10,
#     fit_iter=100,
#     permutation_repeats=100
# )
# imp.to_pickle('feature_importance_w_permutation.pkl')
imp = pd.read_pickle('feature_importance_w_permutation.pkl')
px.bar(imp, x='feature', y='weighted_imp', color='feat_type')
# keep only features contributing more than 0.1% of the weighted importance
most_important_features = imp.loc[imp['weighted_imp'] > 0.001, 'feature'].tolist()
kfold_train_eval(df, features=most_important_features, k=10, model_params=rf_random_best_params)
# sweep the number of top-ranked features, from all down to 2
errors = []
ranked_feats = imp.feature.tolist()
param_max_feats = rf_random_best_params['max_features']  # hoisted: loop-invariant
for top_n in tqdm(range(len(ranked_feats), 1, -1)):
    results = kfold_train_eval(
        df=df,
        features=ranked_feats[:top_n],
        k=10,
        # max_features cannot exceed the number of features in play
        model_params={**rf_random_best_params, 'max_features': min(param_max_feats, top_n)},
    )
    errors.append({'top_n': top_n, 'rmse': results['rmse'], 'mae': results['mae']})
errors = pd.DataFrame(errors)
px.line(errors, x='top_n', y='rmse')
# pick the feature count with the lowest cross-validated RMSE
OPTIMAL_N_FEATURES = errors.loc[errors['rmse'].argmin(), 'top_n']
optimal_feature_set = ranked_feats[:OPTIMAL_N_FEATURES]
kfold_train_eval(df, features=optimal_feature_set, k=10, model_params=rf_random_best_params)
# final split on the selected feature set, then pick the best of 500 fits
train_x, valid_x, train_y, valid_y = get_train_test_split(
    df=df, x_cols=optimal_feature_set, target=TARGET,
    test_size=VALID_SZ, random_state=None)
model = get_best_fit(train_x, train_y, valid_x, valid_y, rf_random_best_params, 500)
def plot_partial_dependence(feature):
    """Plot the partial dependence of the fitted global `model` on `feature`.

    NOTE(review): uses the pdpbox < 0.3 API (pdp_isolate / pdp_plot); newer
    pdpbox releases renamed these — confirm the installed version.
    """
    isolated = pdp.pdp_isolate(
        model=model,
        dataset=valid_x,
        model_features=optimal_feature_set,
        feature=feature,
    )
    pdp.pdp_plot(isolated, feature.title())
# partial dependence for the most interesting features
for feat in ['space', 'energy_requirement', 'energy_efficiency',
             'rooms', 'year_construction', 'renovated_date']:
    plot_partial_dependence(feat)
# SHAP: per-prediction feature contributions for the tree ensemble
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(valid_x)
shap.summary_plot(shap_values, valid_x)